Wang Haihua
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('classic')
from sklearn.datasets import load_iris
iris = load_iris()
df_iris = pd.DataFrame(data=iris['data'],columns=iris['feature_names'])
df_iris['Type'] = iris['target']
df_iris.head()
|   | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | Type |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
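A quick class-balance check (a small added cell, not part of the original output): the iris dataset is balanced across the three types.

df_iris['Type'].value_counts()   # added check: expect 50 samples for each of types 0, 1, 2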
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(iris['data'],iris['target'],test_size=0.3,random_state=1)
from sklearn.linear_model import LogisticRegression
logis = LogisticRegression()
logis.fit(X_train,y_train)
logis.score(X_test,y_test)
0.9777777777777777
test_data = [5.9, 3.0, 4.2, 1.5]   # hypothetical single sample: sepal length, sepal width, petal length, petal width (cm)
logis.predict(np.array(test_data).reshape(1,-1))
array([1])
logis.predict(X_test)
array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1, 2, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 2, 2, 1])
y_test
array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 2, 2, 1])
from sklearn.metrics import confusion_matrix
import seaborn as sn
sn.heatmap(confusion_matrix(y_test,logis.predict(X_test)),annot=True)
<AxesSubplot:>
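Beyond the heatmap, per-class precision and recall can be printed; the following is a small added example using the fitted logis model.

from sklearn.metrics import classification_report
# Added example: per-class precision/recall/F1 for the logistic regression predictions.
print(classification_report(y_test, logis.predict(X_test), target_names=iris['target_names']))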
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
model_names = ['Logistic Regression','KNN','SVM','Decision Tree','Neural Network','Naive Bayes','Random Forest','AdaBoost']
models = [LogisticRegression(),KNeighborsClassifier(),SVC(),DecisionTreeClassifier(),MLPClassifier(max_iter=1000),GaussianNB(),RandomForestClassifier(),AdaBoostClassifier()]
score_list = []
for model in models:
    model.fit(X_train, y_train)
    score_list.append(model.score(X_test, y_test))
plt.figure(figsize=(20,7))
plt.barh(model_names,score_list)
<BarContainer object of 8 artists>
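The same scores can also be listed as a sorted table; a small added view built from the model_names and score_list defined above.

# Added view: test accuracy per model, highest first.
pd.Series(score_list, index=model_names).sort_values(ascending=False)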
df_iris.groupby('Type').median().plot.bar()
<AxesSubplot:xlabel='Type'>
df_iris.groupby('Type').median()
| Type | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) |
|---|---|---|---|---|
| 0 | 5.0 | 3.4 | 1.50 | 0.2 |
| 1 | 5.9 | 2.8 | 4.35 | 1.3 |
| 2 | 6.5 | 3.0 | 5.55 | 2.0 |
df_iris.groupby('Type').median().T.plot.bar()
<AxesSubplot:>
from sklearn.cluster import KMeans
km = KMeans(n_clusters=3)
km.fit(X_train)
KMeans(n_clusters=3)
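Here n_clusters=3 matches the known number of species. As an illustrative sketch (not in the original), the usual elbow check plots the inertia for a range of cluster counts:

# Illustrative elbow check: within-cluster sum of squares (inertia) for k = 1..8.
inertias = [KMeans(n_clusters=k).fit(X_train).inertia_ for k in range(1, 9)]
plt.plot(range(1, 9), inertias, marker='o')
plt.xlabel('number of clusters k')
plt.ylabel('inertia')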
km.predict(X_test)
array([1, 2, 2, 1, 0, 2, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 2, 2, 1, 1, 2, 2, 0, 1, 0, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2, 1, 2, 1, 2, 0, 0, 1, 2, 0, 2])
df_km_train = pd.DataFrame(data=X_train,columns=iris['feature_names'])
df_km_train['Type'] = km.predict(X_train)
df_km_train['Train'] = 1
df_km_test = pd.DataFrame(data=X_test,columns=iris['feature_names'])
df_km_test['Type'] = km.predict(X_test)
df_km_test['Train'] = 0
df_km = pd.concat([df_km_train,df_km_test],axis=0)
df_km
|   | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | Type | Train |
|---|---|---|---|---|---|---|
| 0 | 7.7 | 2.6 | 6.9 | 2.3 | 0 | 1 |
| 1 | 5.7 | 3.8 | 1.7 | 0.3 | 1 | 1 |
| 2 | 5.0 | 3.6 | 1.4 | 0.2 | 1 | 1 |
| 3 | 4.8 | 3.0 | 1.4 | 0.3 | 1 | 1 |
| 4 | 5.2 | 2.7 | 3.9 | 1.4 | 2 | 1 |
| ... | ... | ... | ... | ... | ... | ... |
| 40 | 6.8 | 3.0 | 5.5 | 2.1 | 0 | 0 |
| 41 | 5.1 | 3.5 | 1.4 | 0.3 | 1 | 0 |
| 42 | 6.0 | 2.2 | 5.0 | 1.5 | 2 | 0 |
| 43 | 6.3 | 2.9 | 5.6 | 1.8 | 0 | 0 |
| 44 | 6.6 | 2.9 | 4.6 | 1.3 | 2 | 0 |

150 rows × 6 columns
df_km.groupby(['Type','Train']).mean().iloc[:,:-1].plot.bar()
<AxesSubplot:xlabel='Type,Train'>
km_label = KMeans(n_clusters=3)
km_label.fit(X_train, y_train)   # y_train is ignored here: KMeans is unsupervised
km_label.predict(X_test)
array([1, 0, 0, 1, 2, 0, 2, 1, 1, 2, 0, 1, 2, 0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 2, 0, 1, 1, 0, 0, 0, 2, 0, 2, 0, 1, 0, 1, 0, 2, 2, 1, 0, 2, 0])
km.predict(X_test)
array([1, 2, 2, 1, 0, 2, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 2, 2, 1, 1, 2, 2, 0, 1, 0, 2, 1, 1, 2, 2, 2, 0, 2, 0, 2, 1, 2, 1, 2, 0, 0, 1, 2, 0, 2])
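The two runs assign different cluster ids but the same partition up to relabeling; a permutation-invariant measure such as the adjusted Rand index makes the comparison explicit (a small added check using the fitted models above).

from sklearn.metrics import adjusted_rand_score
# Added check: 1.0 means the two clusterings agree up to a relabeling of cluster ids.
print(adjusted_rand_score(km.predict(X_test), km_label.predict(X_test)))
# Agreement between the KMeans clusters and the true species labels.
print(adjusted_rand_score(y_test, km.predict(X_test)))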
import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([model.children_, model.distances_,
                                      counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
iris = load_iris()
X = iris.data
# setting distance_threshold=0 ensures we compute the full tree.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
model = model.fit(X)
plt.title('Hierarchical Clustering Dendrogram')
# plot the top three levels of the dendrogram
plot_dendrogram(model, truncate_mode='level', p=6)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
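To cut the same hierarchy into a fixed number of flat clusters, n_clusters can be set instead of distance_threshold; the cross-tabulation against the species labels below is an added illustration.

# Added sketch: flat 3-cluster cut of the hierarchy, compared with the true species labels.
agg = AgglomerativeClustering(n_clusters=3)
agg_labels = agg.fit_predict(X)
pd.crosstab(iris.target, agg_labels)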
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(X_train)
PCA()
pca.explained_variance_
array([4.36909984, 0.22100548, 0.09049788, 0.0206056 ])
pca.explained_variance_ratio_
array([0.92935669, 0.04701035, 0.01924992, 0.00438304])
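The first component alone explains about 93% of the variance; the cumulative shares make this explicit (a small added line).

# Added: cumulative share of variance explained by the first k components.
np.cumsum(pca.explained_variance_ratio_)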
pca.components_
array([[ 0.36760197, -0.06649071,  0.85410056,  0.36188398],
       [ 0.63470116,  0.7468054 , -0.17634312, -0.09131939],
       [-0.58983791,  0.58428977,  0.06477864,  0.55362481],
       [-0.33780832,  0.31059   ,  0.48499389, -0.74444632]])
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train_pca[:,:1],y_train)
LogisticRegression()
lr.score(X_test_pca[:,:1],y_test)
0.9777777777777777
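Even a single principal component keeps the test accuracy close to the full-feature model. As an added sketch, the same fit can be repeated for each number of retained components:

# Added sketch: logistic regression accuracy using the first k principal components.
for k in range(1, 5):
    lr_k = LogisticRegression()
    lr_k.fit(X_train_pca[:, :k], y_train)
    print(k, 'components:', lr_k.score(X_test_pca[:, :k], y_test))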
df_pca = pd.DataFrame(X_train_pca[:,:2],columns=['Z1','Z2'])
df_pca['Type'] = y_train
df_pca
|   | Z1 | Z2 | Type |
|---|---|---|---|
| 0 | 3.829677 | 0.219859 | 2 |
| 1 | -2.150407 | 0.946246 | 0 |
| 2 | -2.686849 | 0.414629 | 0 |
| 3 | -2.684287 | -0.169526 | 0 |
| 4 | 0.016025 | -0.680996 | 1 |
| ... | ... | ... | ... |
| 100 | 1.474847 | -0.128888 | 2 |
| 101 | 1.941866 | 0.060690 | 2 |
| 102 | 1.323975 | -0.317661 | 1 |
| 103 | 2.354687 | 0.178675 | 2 |
| 104 | -2.759798 | 0.360291 | 0 |

105 rows × 3 columns
import seaborn as sn
sn.scatterplot(x='Z1',y='Z2',hue='Type',data=df_pca)
<AxesSubplot:xlabel='Z1', ylabel='Z2'>
sn.scatterplot(x='sepal length (cm)',y='sepal width (cm)',hue='Type',data=df_iris)
<AxesSubplot:xlabel='sepal length (cm)', ylabel='sepal width (cm)'>
df_iris
|   | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | Type |
|---|---|---|---|---|---|
| 0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
| 1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
| 2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
| 3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
| 4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
| ... | ... | ... | ... | ... | ... |
| 145 | 6.7 | 3.0 | 5.2 | 2.3 | 2 |
| 146 | 6.3 | 2.5 | 5.0 | 1.9 | 2 |
| 147 | 6.5 | 3.0 | 5.2 | 2.0 | 2 |
| 148 | 6.2 | 3.4 | 5.4 | 2.3 | 2 |
| 149 | 5.9 | 3.0 | 5.1 | 1.8 | 2 |

150 rows × 5 columns